mPFC: Checking for duplicates with dupRadar

Libraries required

library(dupRadar)
library(biomaRt)
library(parallel)
library(viridis)
Loading required package: viridisLite

Bam files (marked duplicates)

bamFiles <- list.files(path = "./input", pattern = ".bam$", full.names = T)
names(bamFiles) <- as.character(unlist(sapply(bamFiles, function(x) strsplit(x, "\\.|\\/")[[1]][4])))

Analyze duplicates

dm <- NULL
if (!file.exists("./output/analyzedDuplicates.rds")) {
    dm <- lapply(bamFiles, function(x) {
        a <- analyzeDuprates(bam = x, gtf = "./input/gencode.vM18.chr_patch_hapl_scaff.annotation.gff3", 
            stranded = 0, paired = F, threads = detectCores() - 1)
        
        # removing versions from gene IDs
        a$ID <- as.character(sapply(as.character(a$ID), function(y) strsplit(y, "\\.")[[1]][1]))
        return(a)
    })
    saveRDS(object = dm, file = "./output/analyzedDuplicates.rds")
} else {
    dm <- readRDS("./output/analyzedDuplicates.rds")
}

Plotting and interpretation

The number of reads per base assigned to a gene in an ideal RNA-Seq data set is expected to be proportional to the abundance of its transcripts in the sample. For lowly expressed genes we expect read duplication to happen rarely by chance, while for highly expressed genes - depending on the total sequencing depth - we expect read duplication to happen often.

A good way to learn if a dataset is following this trend is by relating the normalized number of counts per gene (RPK, as a quantification of the gene expression) and the fraction represented by duplicated reads.

A duprate plot (blue cloud)

for (i in seq_along(dm)) {
    name <- names(dm)[i]
    cat("\n \n")
    cat(paste("###", name))
    cat("\n \n")
    duprateExpDensPlot(DupMat = dm[[i]], pal = viridis(n = 1000), main = name)
    cat("\n \n")
}

CTRL_S1_L001

CTRL_S3_L005

CTRL_S5_L005

MSUS_S2_L005

MSUS_S4_L005

MSUS_S6_L005

Duprate Boxplot

The duprateExpBoxplot plot shows the range of the duplication rates at 5% bins (default) along the distribution of RPK gene counts. The x-axis displays the quantile of the RPK distribution, and the average RPK of the genes contained in this quantile.

for (i in seq_along(dm)) {
    name <- names(dm)[i]
    cat("\n \n")
    cat(paste("###", name))
    cat("\n \n")
    duprateExpBoxplot(DupMat = dm[[i]], main = name)
    cat("\n \n")
}

CTRL_S1_L001

CTRL_S3_L005

CTRL_S5_L005

MSUS_S2_L005

MSUS_S4_L005

MSUS_S6_L005

Read counts expression

for (i in seq_along(dm)) {
    name <- names(dm)[i]
    cat("\n \n")
    cat(paste("###", name))
    cat("\n \n")
    readcountExpBoxplot(DupMat = dm[[i]])
    cat("\n \n")
}

CTRL_S1_L001

CTRL_S3_L005

CTRL_S5_L005

MSUS_S2_L005

MSUS_S4_L005

MSUS_S6_L005

Read counts expression histogram

for (i in seq_along(dm)) {
    name <- names(dm)[i]
    cat("\n \n")
    cat(paste("###", name))
    cat("\n \n")
    expressionHist(DupMat = dm[[i]])
    cat("\n \n")
}

CTRL_S1_L001

CTRL_S3_L005

CTRL_S5_L005

MSUS_S2_L005

MSUS_S4_L005

MSUS_S6_L005

Comparison of multi-mapping RPK and uniquely-mapping RPK

for (i in seq_along(dm)) {
    name <- names(dm)[i]
    cat("\n \n")
    cat(paste("###", name))
    cat("\n \n")
    plot(log2(dm[[i]]$RPK), log2(dm[[i]]$RPKMulti), xlab = "Reads per kb (uniquely mapping reads only)", 
        ylab = "Reads per kb (all including multimappers, non-weighted)")
    cat("\n \n")
}

CTRL_S1_L001

CTRL_S3_L005

CTRL_S5_L005

MSUS_S2_L005

MSUS_S4_L005

MSUS_S6_L005

Connection between possible PCR artefacts and GC content

## set up biomart connection for mouse (needs internet connection)
ensm <- useMart("ensembl")
ensm <- useDataset("mmusculus_gene_ensembl", mart = ensm)

## get a table which has the gene GC content for the IDs that have been used to
## generate the table
tr <- getBM(attributes = c("ensembl_gene_id", "percentage_gene_gc_content"), values = TRUE, 
    mart = ensm)

## create a GC vector with IDs as element names
mgi.gc <- tr$percentage_gene_gc_content
names(mgi.gc) <- tr$ensembl_gene_id

Compare the dependence of duplication rate on expression level independently for below and above median GC genes

for (i in seq_along(dm)) {
    name <- names(dm)[i]
    cat("\n \n")
    cat(paste("###", name))
    cat("\n \n")
    
    keep <- dm[[i]]$ID %in% tr$ensembl_gene_id
    dm.gc <- dm[[i]][keep, ]
    dm.gc$gc <- mgi.gc[dm.gc$ID]
    
    par(mfrow = c(1, 2))
    
    ## below median GC genes
    duprateExpDensPlot(dm.gc[dm.gc$gc <= 45, ], main = "below median GC genes")
    
    ## above median GC genes
    duprateExpDensPlot(dm.gc[dm.gc$gc >= 45, ], main = "above median GC genes")
    cat("\n \n")
}

CTRL_S1_L001

CTRL_S3_L005

CTRL_S5_L005

MSUS_S2_L005

MSUS_S4_L005

MSUS_S6_L005

References

report::cite_packages(sessionInfo())

SessionInfo

devtools::session_info()
─ Session info ───────────────────────────────────────────────────────────────
 setting  value                       
 version  R version 3.6.2 (2019-12-12)
 os       Ubuntu 16.04.7 LTS          
 system   x86_64, linux-gnu           
 ui       X11                         
 language (EN)                        
 collate  en_US.UTF-8                 
 ctype    en_US.UTF-8                 
 tz       Europe/Zurich               
 date     2020-10-07                  

─ Packages ───────────────────────────────────────────────────────────────────
 package       * version    date       lib source                           
 AnnotationDbi   1.48.0     2019-10-29 [1] Bioconductor                     
 askpass         1.1        2019-01-13 [1] CRAN (R 3.6.1)                   
 assertthat      0.2.1      2019-03-21 [1] CRAN (R 3.6.1)                   
 backports       1.1.8      2020-06-17 [1] CRAN (R 3.6.2)                   
 Biobase         2.46.0     2019-10-29 [1] Bioconductor                     
 BiocFileCache   1.10.2     2019-11-08 [1] Bioconductor                     
 BiocGenerics    0.32.0     2019-10-29 [1] Bioconductor                     
 biomaRt       * 2.42.1     2020-03-26 [1] Bioconductor                     
 bit             4.0.4      2020-08-04 [1] CRAN (R 3.6.2)                   
 bit64           4.0.2      2020-07-30 [1] CRAN (R 3.6.2)                   
 blob            1.2.1      2020-01-20 [1] CRAN (R 3.6.2)                   
 bookdown        0.20       2020-06-23 [1] CRAN (R 3.6.2)                   
 callr           3.4.3      2020-03-28 [1] CRAN (R 3.6.2)                   
 cli             2.0.2      2020-02-28 [1] CRAN (R 3.6.2)                   
 colorspace      1.4-1      2019-03-18 [1] CRAN (R 3.6.1)                   
 crayon          1.3.4      2017-09-16 [1] CRAN (R 3.6.1)                   
 curl            4.3        2019-12-02 [1] CRAN (R 3.6.1)                   
 DBI             1.1.0      2019-12-15 [1] CRAN (R 3.6.1)                   
 dbplyr          1.4.4      2020-05-27 [1] CRAN (R 3.6.2)                   
 desc            1.2.0      2018-05-01 [1] CRAN (R 3.6.1)                   
 devtools        2.3.1      2020-07-21 [1] CRAN (R 3.6.2)                   
 digest          0.6.25     2020-02-23 [1] CRAN (R 3.6.2)                   
 dplyr           1.0.1      2020-07-31 [1] CRAN (R 3.6.2)                   
 dupRadar      * 1.16.0     2019-10-29 [1] Bioconductor                     
 ellipsis        0.3.1      2020-05-15 [1] CRAN (R 3.6.2)                   
 evaluate        0.14       2019-05-28 [1] CRAN (R 3.6.1)                   
 fansi           0.4.1      2020-01-08 [1] CRAN (R 3.6.2)                   
 formatR         1.7        2019-06-11 [1] CRAN (R 3.6.1)                   
 fs              1.5.0      2020-07-31 [1] CRAN (R 3.6.2)                   
 generics        0.0.2      2018-11-29 [1] CRAN (R 3.6.1)                   
 ggplot2         3.3.2      2020-06-19 [1] CRAN (R 3.6.2)                   
 glue            1.4.1      2020-05-13 [1] CRAN (R 3.6.2)                   
 gridExtra       2.3        2017-09-09 [1] CRAN (R 3.6.1)                   
 gtable          0.3.0      2019-03-25 [1] CRAN (R 3.6.1)                   
 hms             0.5.3      2020-01-08 [1] CRAN (R 3.6.2)                   
 htmltools       0.5.0      2020-06-16 [1] CRAN (R 3.6.2)                   
 httr            1.4.2      2020-07-20 [1] CRAN (R 3.6.2)                   
 insight         0.9.0      2020-07-20 [1] CRAN (R 3.6.2)                   
 IRanges         2.20.2     2020-01-13 [1] Bioconductor                     
 jsonlite        1.7.0      2020-06-25 [1] CRAN (R 3.6.2)                   
 KernSmooth      2.23-17    2020-04-26 [1] CRAN (R 3.6.2)                   
 knitr         * 1.29       2020-06-23 [1] CRAN (R 3.6.2)                   
 lifecycle       0.2.0      2020-03-06 [1] CRAN (R 3.6.2)                   
 magrittr        1.5        2014-11-22 [1] CRAN (R 3.6.1)                   
 memoise         1.1.0.9000 2020-05-06 [1] Github (r-lib/memoise@4aefd9f)   
 munsell         0.5.0      2018-06-12 [1] CRAN (R 3.6.1)                   
 openssl         1.4.1      2019-07-18 [1] CRAN (R 3.6.1)                   
 pillar          1.4.6      2020-07-10 [1] CRAN (R 3.6.2)                   
 pkgbuild        1.1.0      2020-07-13 [1] CRAN (R 3.6.2)                   
 pkgconfig       2.0.3      2019-09-22 [1] CRAN (R 3.6.1)                   
 pkgload         1.1.0      2020-05-29 [1] CRAN (R 3.6.2)                   
 prettyunits     1.1.1      2020-01-24 [1] CRAN (R 3.6.2)                   
 processx        3.4.3      2020-07-05 [1] CRAN (R 3.6.2)                   
 progress        1.2.2      2019-05-16 [1] CRAN (R 3.6.1)                   
 ps              1.3.3      2020-05-08 [1] CRAN (R 3.6.2)                   
 purrr           0.3.4      2020-04-17 [1] CRAN (R 3.6.2)                   
 R6              2.4.1      2019-11-12 [1] CRAN (R 3.6.1)                   
 rappdirs        0.3.1      2016-03-28 [1] CRAN (R 3.6.1)                   
 Rcpp            1.0.5      2020-07-06 [1] CRAN (R 3.6.2)                   
 remotes         2.2.0      2020-07-21 [1] CRAN (R 3.6.2)                   
 report          0.1.0      2020-03-19 [1] Github (easystats/report@dcdd283)
 rlang           0.4.7      2020-07-09 [1] CRAN (R 3.6.2)                   
 rmarkdown       2.3        2020-06-18 [1] CRAN (R 3.6.2)                   
 rmdformats    * 0.4.0      2020-06-07 [1] Github (juba/rmdformats@94cd7a3) 
 rprojroot       1.3-2      2018-01-03 [1] CRAN (R 3.6.1)                   
 RSQLite         2.1.4      2019-12-04 [1] CRAN (R 3.6.1)                   
 Rsubread        2.0.1      2020-01-27 [1] Bioconductor                     
 S4Vectors       0.24.4     2020-04-09 [1] Bioconductor                     
 scales          1.1.1      2020-05-11 [1] CRAN (R 3.6.2)                   
 sessioninfo     1.1.1      2018-11-05 [1] CRAN (R 3.6.1)                   
 stringi         1.4.6      2020-02-17 [1] CRAN (R 3.6.2)                   
 stringr         1.4.0      2019-02-10 [1] CRAN (R 3.6.1)                   
 testthat        2.3.2      2020-03-02 [1] CRAN (R 3.6.2)                   
 tibble          3.0.3      2020-07-10 [1] CRAN (R 3.6.2)                   
 tidyselect      1.1.0      2020-05-11 [1] CRAN (R 3.6.2)                   
 usethis         1.6.1      2020-04-29 [1] CRAN (R 3.6.2)                   
 vctrs           0.3.2      2020-07-15 [1] CRAN (R 3.6.2)                   
 viridis       * 0.5.1      2018-03-29 [1] CRAN (R 3.6.1)                   
 viridisLite   * 0.3.0      2018-02-01 [1] CRAN (R 3.6.1)                   
 withr           2.2.0      2020-04-20 [1] CRAN (R 3.6.2)                   
 xfun            0.16       2020-07-24 [1] CRAN (R 3.6.2)                   
 XML             3.99-0.3   2020-01-20 [1] CRAN (R 3.6.2)                   
 yaml            2.2.1      2020-02-01 [1] CRAN (R 3.6.2)                   

[1] /home/ubuntu/R/x86_64-pc-linux-gnu-library/3.6
[2] /usr/local/lib/R/site-library
[3] /usr/lib/R/site-library
[4] /usr/lib/R/library